In [1]:
%pylab inline
import numbapro
from numbapro import *


Populating the interactive namespace from numpy and matplotlib

In [4]:
numbapro.check_cuda()


------------------------------libraries detection-------------------------------
Finding cublas
	located at /home/chiroptera/anaconda/lib/libcublas.so.6.0.37
	trying to open library...	ok
Finding cusparse
	located at /home/chiroptera/anaconda/lib/libcusparse.so.6.0.37
	trying to open library...	ok
Finding cufft
	located at /home/chiroptera/anaconda/lib/libcufft.so.6.0.37
	trying to open library...	ok
Finding curand
	located at /home/chiroptera/anaconda/lib/libcurand.so.6.0.37
	trying to open library...	ok
Finding nvvm
	located at /home/chiroptera/anaconda/lib/libnvvm.so.2.0.0
	trying to open library...	ok
	finding libdevice for compute_20...	ok
	finding libdevice for compute_30...	ok
	finding libdevice for compute_35...	ok
-------------------------------hardware detection-------------------------------
Found 1 CUDA devices
id 0         GeForce GT 520M                              [SUPPORTED]
                      compute capability: 2.1
                           pci device id: 0
                              pci bus id: 1
Summary:
	1/1 devices are supported
PASSED
Out[4]:
True

In [2]:
%load https://raw.githubusercontent.com/Chiroptera/QCThesis/master/CUDA/K_Means.py

In [2]:
"""
Created on Fri Mar 27 08:53:20 2015

@author: Diogo Silva
"""

import numpy as np
import numbapro
from numbapro import *

class K_Means:       
       
    
    def __init__(self,N=None,D=None,K=None):
        self.N = N
        self.D = D
        self.K = K
        
        self._cudaDataRef = None
        
        self.__cuda = True
        self.__cuda_mem = "auto"

    @property    
    def cuda_mem(self):
        return self.__cuda_mem

    @cuda_mem.setter
    def cuda_mem(self,cuda_mem):
        if cuda_mem not in ['manual','auto']:
            raise Exception("cuda_mem = \'manual\' or \'auto\'")
    
    def fit(self,data,K,iters=3,cuda=True):
 
        if iters == 0:
            return
       
        N,D = data.shape
            
        self.N = N
        self.D = D
        self.K = K
        
        centroids = self._init_centroids(data)
        
        for i in xrange(iters):
            dist_mat = self._calc_dists(data,centroids,cuda=cuda)
            assign,grouped_data = self._assign_data(data,dist_mat)
            centroids =  self._np_recompute_centroids(grouped_data)
            self.centroids = centroids

    def _init_centroids(self,data):
        
        centroids = np.empty((self.K,self.D),dtype=data.dtype)
        random_init = np.random.randint(0,self.N,self.K)
        self.init_seed = random_init
        
        for k in xrange(self.K):
            centroids[k] = data[random_init[k]]
        
        self.centroids = centroids
        
        return centroids

    def _calc_dists(self,data,centroids,cuda=False):
        if cuda:
            dist_mat = self._cu_calc_dists(data,centroids,gridDim=None,
                                           blockDim=None,memManage='manual')
        else:
            dist_mat = self._np_calc_dists(data,centroids)
            
        return dist_mat
            
    def _py_calc_dists(data,centroids):
        N,D = data.shape
        K,cD = centroids.shape

        for n in range(N):
            for k in range(K):
                dist=0
                for d in range(dim):
                    diff = a[n,d]-b[k,d]
                    dist += diff ** 2
                c[n,k]=dist
            
    def _np_calc_dists(self,data,centroids):
        """
        NumPy implementation - much faster than vanilla Python
        """
        N,D = data.shape
        K,cD = centroids.shape

        dist_mat = np.empty((N,K),dtype=data.dtype)    
        
        for k in xrange(K):
            dist = data - centroids[k]
            dist = dist ** 2
            dist_mat[:,k] = dist.sum(axis=1)
            
        return dist_mat
    
    def _cu_calc_dists(self,data,centroids,gridDim=None,blockDim=None,
                       memManage='auto',keepDataRef=True):
        """
        TODO:
            - deal with gigantic data / distance matrix
            - deal with heavely assymetric distance matrix
                - if the number of blocks on any given dimension of 
                the grid > 2**16, divide that dimension by another dimension
                - don't forget to change the index computation in the kernel
        """
        
        
        N,D = data.shape
        K,cD = centroids.shape
        
        self.cuda_mem = memManage
        
        if self.__cuda_mem  not in ('manual','auto'):
            raise Exception("Invalid value for \'memManage\'.")

            
        if gridDim is None or blockDim is None:
            #dists shape
            

            MAX_THREADS_BLOCK = 16 * 20 # GT520M has 48 CUDA cores
            MAX_GRID_XYZ_DIM = 65535

            if K <= 28:
                blockWidth = K
                blockHeight = np.floor(MAX_THREADS_BLOCK / blockWidth)
                blockHeight = np.int(blockHeight)
            else:
                blockWidth = 20
                blockHeight = 16

            # grid width/height is the number of blocks necessary to fill
            # the columns/rows of the matrix
            gridWidth = np.ceil(np.float(K) / blockWidth)
            gridHeight = np.ceil(np.float(N) / blockHeight)

    
            blockDim = blockWidth, blockHeight
            gridDim = np.int(gridWidth), np.int(gridHeight)
        
        self.blockDim = blockDim
        self.gridDim = gridDim        
        
        distShape =  N,K
        dist_mat = np.empty(distShape,dtype=data.dtype)
        
        if self.__cuda_mem == 'manual':
            
            if keepDataRef:
                if self._cudaDataRef is None:
                    dData = cuda.to_device(data)
                    self._cudaDataRef = dData
                else:
                    dData = self._cudaDataRef
            else:
                dData = cuda.to_device(data)
                
            dCentroids = cuda.to_device(centroids)
            dDists = numbapro.cuda.device_array_like(dist_mat)
            
            self._cu_dist_kernel[gridDim,blockDim](dData,dCentroids,dDists)        
        
            dDists.copy_to_host(ary=dist_mat)
            numbapro.cuda.synchronize()

        elif self.__cuda_mem == 'auto':
            self._cu_dist_kernel[gridDim,blockDim](data,centroids,dist_mat) 
        
        return dist_mat
        
    
    @numbapro.cuda.jit("void(float32[:,:], float32[:,:], float32[:,:])")
    def _cu_dist_kernel(a,b,c):
        k,n = numbapro.cuda.grid(2)

        ch, cw = c.shape # c width and height

        if n >= ch or k >= cw:
            return

        dist = 0.0
        for d in range(a.shape[1]):
            diff = a[n,d]-b[k,d]
            dist += diff ** 2
        c[n,k]= dist
    
        
    def _assign_data(self,data,dist_mat):
        
        N,K = dist_mat.shape
        
        assign = np.argmin(dist_mat,axis=1)
        
        grouped_data=[[] for i in xrange(K)]
        
        
        for n in xrange(N):
            # add datum i to its assigned cluster assign[i]
            grouped_data[assign[n]].append(data[n])
        
        for k in xrange(K):
            grouped_data[k] = np.array(grouped_data[k])
        
        return assign,grouped_data
    
    def _np_recompute_centroids(self,grouped_data):
        
        # change to get dimension from class or search a non-empty cluster
        #dim = grouped_data[0][0].shape[1]
        dim = self.D
        K = len(grouped_data)
        
        centroids = np.empty((K,dim))
        for k in xrange(K):
            centroids[k] = np.mean(grouped_data[k],axis=0)
        
        return centroids


    def _cu_mean(self):

        pass


---------------------------------------------------------------------------
CudaSupportError                          Traceback (most recent call last)
<ipython-input-2-f64f068dcd5d> in <module>()
      9 from numbapro import *
     10 
---> 11 class K_Means:
     12 
     13 

<ipython-input-2-f64f068dcd5d> in K_Means()
    176 
    177 
--> 178     @numbapro.cuda.jit("void(float32[:,:], float32[:,:], float32[:,:])")
    179     def _cu_dist_kernel(a,b,c):
    180         k,n = numbapro.cuda.grid(2)

/home/chiroptera/anaconda/lib/python2.7/site-packages/numba/cuda/decorators.pyc in kernel_jit(func)
    122             # Force compilation for the current context
    123             if bind:
--> 124                 kernel.bind()
    125 
    126             return kernel

/home/chiroptera/anaconda/lib/python2.7/site-packages/numba/cuda/compiler.pyc in bind(self)
    311         Force binding to current CUDA context
    312         """
--> 313         self._func.get()
    314 
    315     @property

/home/chiroptera/anaconda/lib/python2.7/site-packages/numba/cuda/compiler.pyc in get(self)
    249 
    250     def get(self):
--> 251         cuctx = get_context()
    252         device = cuctx.device
    253         cufunc = self.cache.get(device.id)

/home/chiroptera/anaconda/lib/python2.7/site-packages/numba/cuda/cudadrv/devices.pyc in get_context(devnum)
    226     return the CUDA context.
    227     """
--> 228     return _runtime.get_or_create_context(devnum)
    229 
    230 

/home/chiroptera/anaconda/lib/python2.7/site-packages/numba/cuda/cudadrv/devices.pyc in get_or_create_context(self, devnum)
    188             return self.current_context
    189         else:
--> 190             return _runtime.push_context(self.gpus[devnum])
    191 
    192     def reset(self):

/home/chiroptera/anaconda/lib/python2.7/site-packages/numba/cuda/cudadrv/devices.pyc in __getitem__(self, devnum)
     35 
     36     def __getitem__(self, devnum):
---> 37         return self.lst[devnum]
     38 
     39     def __str__(self):

/home/chiroptera/anaconda/lib/python2.7/site-packages/numba/cuda/cudadrv/devices.pyc in __getattr__(self, attr)
     24             # Device list is not initialized.
     25             # Query all CUDA devices.
---> 26             numdev = driver.get_device_count()
     27             gpus = [_DeviceContextManager(driver.get_device(devid))
     28                     for devid in range(numdev)]

/home/chiroptera/anaconda/lib/python2.7/site-packages/numba/cuda/cudadrv/driver.pyc in get_device_count(self)
    254     def get_device_count(self):
    255         count = c_int()
--> 256         self.cuDeviceGetCount(byref(count))
    257         return count.value
    258 

/home/chiroptera/anaconda/lib/python2.7/site-packages/numba/cuda/cudadrv/driver.pyc in __getattr__(self, fname)
    199         # Initialize driver
    200         if not self.is_initialized:
--> 201             self.initialize()
    202 
    203         if self.initialization_error is not None:

/home/chiroptera/anaconda/lib/python2.7/site-packages/numba/cuda/cudadrv/driver.pyc in initialize(self)
    180         except CudaAPIError as e:
    181             self.initialization_error = e
--> 182             raise CudaSupportError("Error at driver init: \n%s:" % e)
    183 
    184     @property

CudaSupportError: Error at driver init: 
Call to cuInit results in CUDA_ERROR_UNKNOWN:

In [125]:
##generate data
n = 1e6
d = 2
k = 20

## Generate data
from sklearn import datasets
#data = np.random.random((n,d)).astype(np.float32)
data, groundTruth = datasets.make_blobs(n_samples=np.int(n),n_features=d,centers=k,
                                        center_box=(-1000.0,1000.0))

total_bytes = (n * d + k * d + n * k) * 4
print 'Memory used by arrays:\t',total_bytes/1024,'\tKBytes'
print '\t\t\t',total_bytes/(1024*1024),'\tMBytes'

print 'Memory used by data:  \t',n * d * 4 / 1024,'\t','KBytes'

data = np.random.random((n,d)).astype(np.float32)


Memory used by arrays:	85937.65625 	KBytes
			83.9234924316 	MBytes
Memory used by data:  	7812.5 	KBytes

In [6]:
from timeit import default_timer as timer

In [7]:
times=dict()

In [127]:
start = timer()
grouperCUDA = K_Means()
grouperCUDA.fit(data,k,iters=3,cuda=True)
times['cuda'] = timer() - start
print times['cuda']


4.95243692398

In [129]:
#%debug
start = timer()
grouperNP = K_Means()
grouperNP.fit(data,k,cuda=False)
times['numpy'] = timer() - start
print times['numpy']


8.96777701378

In [10]:
print 'Times'
print 'CUDA ','\t',times['cuda']
print 'NumPy','\t',times['numpy']


Times
CUDA  	4.96065402031
NumPy 	8.92235088348

In [11]:
import cProfile
#from line_profiler import LineProfiler

In [12]:
#profile = LineProfiler(grouperCUDA.fit(data,k,iters=3,cuda=True))
#profile.print_stats()
cProfile.run("grouperCUDA.fit(data,k,iters=3,cuda=True)")


         3003022 function calls (3002977 primitive calls) in 5.397 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        3    0.000    0.000    0.710    0.237 <ipython-input-4-994d974b26e9>:102(_cu_calc_dists)
        3    2.802    0.934    4.541    1.514 <ipython-input-4-994d974b26e9>:194(_assign_data)
        3    0.000    0.000    0.126    0.042 <ipython-input-4-994d974b26e9>:212(_np_recompute_centroids)
        1    0.012    0.012    5.391    5.391 <ipython-input-4-994d974b26e9>:33(fit)
        1    0.001    0.001    0.001    0.001 <ipython-input-4-994d974b26e9>:52(_init_centroids)
        3    0.000    0.000    0.710    0.237 <ipython-input-4-994d974b26e9>:65(_calc_dists)
        1    0.006    0.006    5.397    5.397 <string>:1(<module>)
        9    0.000    0.000    0.000    0.000 <string>:8(__new__)
       18    0.000    0.000    0.001    0.000 _methods.py:34(_prod)
       60    0.000    0.000    0.000    0.000 _methods.py:43(_count_reduce_items)
       60    0.001    0.000    0.126    0.002 _methods.py:53(_mean)
       18    0.002    0.000    0.002    0.000 arrayobj.py:46(make_array_ctype)
       18    0.000    0.000    0.000    0.000 arrayobj.py:61(c_array)
        3    0.000    0.000    0.000    0.000 compiler.py:172(copy)
        3    0.000    0.000    0.000    0.000 compiler.py:175(configure)
        3    0.000    0.000    0.000    0.000 compiler.py:201(__getitem__)
        3    0.000    0.000    0.000    0.000 compiler.py:250(get)
        3    0.000    0.000    0.710    0.237 compiler.py:301(__call__)
        3    0.000    0.000    0.709    0.236 compiler.py:326(_kernel_call)
        9    0.000    0.000    0.085    0.009 compiler.py:377(_prepare_args)
        9    0.000    0.000    0.623    0.069 compiler.py:381(<lambda>)
        9    0.000    0.000    0.000    0.000 contextlib.py:12(__init__)
        9    0.000    0.000    0.000    0.000 contextlib.py:15(__enter__)
        9    0.000    0.000    0.000    0.000 contextlib.py:21(__exit__)
        9    0.000    0.000    0.000    0.000 contextlib.py:82(helper)
        6    0.000    0.000    0.000    0.000 copy.py:306(_reconstruct)
        6    0.000    0.000    0.001    0.000 copy.py:66(copy)
        6    0.000    0.000    0.000    0.000 copy_reg.py:92(__newobj__)
        9    0.000    0.000    0.000    0.000 devicearray.py:123(__del__)
       18    0.000    0.000    0.000    0.000 devicearray.py:129(_default_stream)
       18    0.000    0.000    0.000    0.000 devicearray.py:141(device_ctypes_pointer)
        9    0.000    0.000    0.073    0.008 devicearray.py:151(copy_to_device)
        9    0.000    0.000    0.623    0.069 devicearray.py:168(copy_to_host)
        9    0.000    0.000    0.000    0.000 devicearray.py:232(as_cuda_arg)
        9    0.000    0.000    0.012    0.001 devicearray.py:332(from_array_like)
        9    0.000    0.000    0.000    0.000 devicearray.py:347(sentry_contiguous)
        9    0.000    0.000    0.085    0.009 devicearray.py:353(auto_device)
        9    0.000    0.000    0.012    0.001 devicearray.py:56(__init__)
       21    0.000    0.000    0.001    0.000 devices.py:108(current_context)
       21    0.000    0.000    0.001    0.000 devices.py:183(get_or_create_context)
       21    0.000    0.000    0.001    0.000 devices.py:224(get_context)
        3    0.000    0.000    0.000    0.000 driver.py:1018(configure)
        3    0.000    0.000    0.000    0.000 driver.py:1035(__call__)
        3    0.000    0.000    0.000    0.000 driver.py:1070(launch_kernel)
       27    0.000    0.000    0.000    0.000 driver.py:1247(host_pointer)
        9    0.000    0.000    0.000    0.000 driver.py:1258(host_memory_extents)
        9    0.000    0.000    0.000    0.000 driver.py:1263(memory_size_from_info)
        9    0.000    0.000    0.000    0.000 driver.py:1273(host_memory_size)
       36    0.000    0.000    0.001    0.000 driver.py:1280(device_pointer)
       45    0.000    0.000    0.001    0.000 driver.py:1285(device_ctypes_pointer)
       72    0.000    0.000    0.000    0.000 driver.py:1293(is_device_memory)
       45    0.000    0.000    0.000    0.000 driver.py:1304(require_device_memory)
        9    0.000    0.000    0.000    0.000 driver.py:1311(device_memory_depends)
       18    0.000    0.000    0.074    0.004 driver.py:1321(host_to_device)
        9    0.000    0.000    0.623    0.069 driver.py:1339(device_to_host)
       87    0.700    0.008    0.700    0.008 driver.py:212(safe_cuda_api_call)
       87    0.000    0.000    0.000    0.000 driver.py:241(_check_error)
       21    0.000    0.000    0.000    0.000 driver.py:270(get_context)
        9    0.000    0.000    0.000    0.000 driver.py:291(add_trash)
        9    0.000    0.000    0.002    0.000 driver.py:294(process)
        9    0.000    0.000    0.004    0.000 driver.py:498(memalloc)
        9    0.000    0.000    0.000    0.000 driver.py:664(_make_mem_finalizer)
        9    0.000    0.000    0.000    0.000 driver.py:665(mem_finalize)
        9    0.000    0.000    0.000    0.000 driver.py:669(core)
        9    0.000    0.000    0.002    0.000 driver.py:670(cleanup)
        9    0.000    0.000    0.000    0.000 driver.py:728(__init__)
        9    0.000    0.000    0.000    0.000 driver.py:739(__del__)
        9    0.000    0.000    0.000    0.000 driver.py:746(own)
        9    0.000    0.000    0.000    0.000 driver.py:749(free)
       45    0.000    0.000    0.000    0.000 driver.py:778(device_ctypes_pointer)
        9    0.000    0.000    0.000    0.000 driver.py:838(__init__)
        9    0.000    0.000    0.000    0.000 driver.py:847(__del__)
       90    0.000    0.000    0.000    0.000 driver.py:858(__getattr__)
        9    0.000    0.000    0.000    0.000 driver.py:911(query)
        9    0.000    0.000    0.000    0.000 driver.py:925(record)
       18    0.000    0.000    0.000    0.000 dummyarray.py:104(is_contiguous)
       18    0.000    0.000    0.000    0.000 dummyarray.py:108(compute_index)
       54    0.000    0.000    0.000    0.000 dummyarray.py:109(<genexpr>)
        9    0.000    0.000    0.001    0.000 dummyarray.py:148(from_desc)
        9    0.000    0.000    0.001    0.000 dummyarray.py:157(__init__)
       27    0.000    0.000    0.000    0.000 dummyarray.py:160(<genexpr>)
       27    0.000    0.000    0.000    0.000 dummyarray.py:161(<genexpr>)
        9    0.000    0.000    0.000    0.000 dummyarray.py:167(_compute_layout)
        9    0.000    0.000    0.000    0.000 dummyarray.py:172(is_contig)
        9    0.000    0.000    0.000    0.000 dummyarray.py:184(_compute_extent)
       18    0.000    0.000    0.000    0.000 dummyarray.py:27(__init__)
       36    0.000    0.000    0.000    0.000 dummyarray.py:80(get_offset)
       18    0.000    0.000    0.001    0.000 fromnumeric.py:2259(prod)
       60    0.000    0.000    0.126    0.002 fromnumeric.py:2651(mean)
        3    0.000    0.000    0.149    0.050 fromnumeric.py:946(argmin)
        9    0.000    0.000    0.005    0.001 ndarray.py:130(ndarray_populate_head)
        9    0.000    0.000    0.000    0.000 ndarray.py:42(__new__)
       18    0.000    0.000    0.000    0.000 ndarray.py:69(get_stage)
        9    0.000    0.000    0.001    0.000 ndarray.py:80(allocate)
        9    0.000    0.000    0.000    0.000 ndarray.py:91(free)
        9    0.000    0.000    0.002    0.000 ndarray.py:96(write)
       60    0.000    0.000    0.000    0.000 numeric.py:464(asanyarray)
        9    0.000    0.000    0.002    0.000 service.py:18(service)
       42    0.000    0.000    0.000    0.000 threadlocal.py:13(stack)
       21    0.000    0.000    0.000    0.000 threadlocal.py:29(top)
       21    0.000    0.000    0.000    0.000 threadlocal.py:33(is_empty)
       21    0.000    0.000    0.000    0.000 threadlocal.py:37(__bool__)
       21    0.000    0.000    0.000    0.000 threadlocal.py:40(__nonzero__)
        9    0.000    0.000    0.000    0.000 utils.py:142(__setitem__)
        9    0.000    0.000    0.000    0.000 {_ctypes.addressof}
       30    0.000    0.000    0.000    0.000 {_ctypes.byref}
       18    0.000    0.000    0.000    0.000 {_ctypes.sizeof}
       38    0.000    0.000    0.000    0.000 {_warnings.warn}
       18    0.000    0.000    0.000    0.000 {_weakref.proxy}
       15    0.000    0.000    0.000    0.000 {built-in method __new__ of type object at 0x7f4c0a07ad00}
  183/138    0.000    0.000    0.000    0.000 {getattr}
       44    0.000    0.000    0.000    0.000 {hasattr}
        6    0.000    0.000    0.000    0.000 {id}
      198    0.000    0.000    0.000    0.000 {isinstance}
       60    0.000    0.000    0.000    0.000 {issubclass}
      102    0.000    0.000    0.000    0.000 {len}
        6    0.000    0.000    0.000    0.000 {method '__reduce_ex__' of 'object' objects}
        9    0.000    0.000    0.000    0.000 {method 'add' of 'set' objects}
       18    0.000    0.000    0.000    0.000 {method 'append' of 'collections.deque' objects}
  3000051    0.270    0.000    0.270    0.000 {method 'append' of 'list' objects}
        3    0.149    0.050    0.149    0.050 {method 'argmin' of 'numpy.ndarray' objects}
        1    0.000    0.000    0.000    0.000 {method 'disable' of '_lsprof.Profiler' objects}
        9    0.000    0.000    0.000    0.000 {method 'discard' of 'set' objects}
        9    0.000    0.000    0.000    0.000 {method 'extend' of 'list' objects}
       24    0.000    0.000    0.000    0.000 {method 'get' of 'dict' objects}
        9    0.000    0.000    0.000    0.000 {method 'pop' of 'list' objects}
       18    0.000    0.000    0.000    0.000 {method 'popleft' of 'collections.deque' objects}
        1    0.000    0.000    0.000    0.000 {method 'randint' of 'mtrand.RandomState' objects}
       78    0.124    0.002    0.124    0.002 {method 'reduce' of 'numpy.ufunc' objects}
        6    0.000    0.000    0.000    0.000 {method 'update' of 'dict' objects}
        9    0.000    0.000    0.000    0.000 {min}
       18    0.000    0.000    0.002    0.000 {next}
       27    0.000    0.000    0.000    0.000 {numba.mviewbuf.memoryview_get_buffer}
        9    0.000    0.000    0.000    0.000 {numba.mviewbuf.memoryview_get_extents_info}
        9    0.000    0.000    0.000    0.000 {numba.mviewbuf.memoryview_get_extents}
      120    1.321    0.011    1.321    0.011 {numpy.core.multiarray.array}
        7    0.000    0.000    0.000    0.000 {numpy.core.multiarray.empty}
       18    0.000    0.000    0.000    0.000 {sum}
       30    0.000    0.000    0.000    0.000 {zip}



In [ ]:
#profile = LineProfiler(grouperCUDA.fit(data,k,iters=3,cuda=True))
#profile.print_stats()
#cProfile.run("grouperCUDA.fit(data,k,iters=3,cuda=False)")

In [23]:
print 'Speedup:','\t\t', 1.367/0.319
print 'Centroids portion','\t',25.2/31


Speedup: 		4.28526645768
Centroids portion 	0.812903225806

Centroid computation

The